library(tidyverse)
library(tidytext)
library(here)
library(lexiconPT)
library(plotly)
# Make the black-and-white theme the default for every ggplot2 plot.
theme_set(theme_bw())

# Load the evaluated complaints; the path is resolved from the project root
# by here().
reclamacoes <- read_csv(
  here("data/3-avaliacao-humana/reclamacoes-avaliadas-20190515.csv")
)
Parsed with column specification:
cols(
id = col_double(),
orgao = col_character(),
data = col_character(),
titulo = col_character(),
texto = col_character(),
link = col_character(),
`Grupo que vai avaliar` = col_double(),
insatisfacao = col_double(),
avaliadores = col_double(),
range.avaliacoes = col_double()
)
#reclamacoes = reclamacoes_raw %>%
# mutate(
# nome_orgao_site = orgao,
# orgao = str_split(link, "/") %>% map_chr(~ .[[5]])
# ) %>%
# filter(orgao %in% c("inss-ministerio-da-previdencia-social", #"anac-agencia-nacional-de-aviacao-civil")) %>%
# mutate(id = 1:n(),
# grupo_avaliando = id %% 6 + 1)
O processo de estimativa será fortemente baseado em: https://sillasgonzaga.github.io/2017-09-23-sensacionalista-pt01/
# Load both sentiment lexicons into the workspace in a single call
# (presumably shipped with lexiconPT -- confirm), then bind them to shorter
# aliases used by the rest of the script.
data(list = c("oplexicon_v3.0", "sentiLex_lem_PT02"))
op30 <- oplexicon_v3.0
sent <- sentiLex_lem_PT02

# Quick look at the structure of the first lexicon.
glimpse(op30)
Observations: 32,191
Variables: 4
$ term <chr> "=[", "=@", "=p", "=P", "=x", "=d", "=D", ";)", ";)", ";@", ";*", ";**", ";~", ";D", ";D", ";p",…
$ type <chr> "emot", "emot", "emot", "emot", "emot", "emot", "emot", "emot", "emot", "emot", "emot", "emot", …
$ polarity <int> -1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1…
$ polarity_revision <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "…
Precisamos de um dataframe em que cada observação é uma palavra.
# Tokenise the complaint text: the result has one row per (id, token), with
# the token stored in the new column `termo`.
palavra_a_palavra <- reclamacoes %>%
  select(id, texto) %>%
  unnest_tokens(output = termo, input = texto)

# Preview the first 20 tokens.
head(select(palavra_a_palavra, id, termo), n = 20)
# Collapse each lexicon to exactly one row per term before joining.
# The lexicons can list the same term more than once (the op30 `term` column
# shows repeated entries), and joining against duplicated keys would repeat
# token rows: that inflates the per-complaint word count and lets duplicates
# in one lexicon multiply the polarity sum of the other.
# Repeated terms are resolved by averaging their polarities; NA polarities are
# ignored, and a term with no usable polarity ends up as NaN, which the
# downstream is.na()/na.rm handling treats as "no sentiment".
polaridade_op30 <- op30 %>%
  group_by(term) %>%
  summarise(op30 = mean(polarity, na.rm = TRUE))

polaridade_sent <- sent %>%
  group_by(term) %>%
  summarise(sent = mean(polarity, na.rm = TRUE))

# Attach both polarities to every token. left_join keeps tokens that are
# absent from a lexicon, with NA in the corresponding polarity column, so the
# row count stays equal to the number of tokens.
palavras_com_sentimento <- palavra_a_palavra %>%
  left_join(polaridade_op30, by = c("termo" = "term")) %>%
  left_join(polaridade_sent, by = c("termo" = "term"))
Agora, de fato, calculamos a polaridade acumulada (via somatório) de cada reclamação e salvamos o resultado em um arquivo CSV.
# Aggregate token-level polarities into one row per complaint.
#   sentimento_*: summed polarity with the sign flipped, so a text dominated
#                 by negative-polarity words gets a positive score
#   palavras_*  : number of tokens that had a polarity in that lexicon
#   palavras    : number of token rows for the complaint
sentimentos <- palavras_com_sentimento %>%
  group_by(id) %>%
  summarise(
    sentimento_op30 = -1 * sum(op30, na.rm = TRUE),
    palavras_op30 = sum(!is.na(op30)),
    sentimento_sent = -1 * sum(sent, na.rm = TRUE),
    palavras_sent = sum(!is.na(sent)),
    palavras = n()
  )

# Persist the per-complaint scores.
write_csv(sentimentos, here("data/5-sentimentos/sentimento.csv"))
# Linearly rescale a numeric vector so its minimum maps to 1 and its maximum
# to 5. If every value is identical the range is zero and the result is NaN.
escalar_1_a_5 <- function(v) {
  (v - min(v)) / (max(v) - min(v)) * 4 + 1
}

# Rescaled scores, keyed by complaint id. Columns are selected by name rather
# than by position, so reordering the summary columns cannot silently pick the
# wrong one.
normalized_op <- sentimentos %>%
  transmute(id, sentimento_op30 = escalar_1_a_5(sentimento_op30))
normalized_sent <- sentimentos %>%
  transmute(id, sentimento_sent = escalar_1_a_5(sentimento_sent))

# Attach the scores by id instead of by row position: `sentimentos` has one
# row per id that produced tokens, which is not guaranteed to match the row
# order (or row count) of `reclamacoes`. Complaints without a score get NA.
reclamacoes <- reclamacoes %>%
  left_join(
    rename(normalized_op, sentimentos_op = sentimento_op30),
    by = "id"
  ) %>%
  left_join(
    rename(normalized_sent, sentimentos_sent = sentimento_sent),
    by = "id"
  )
#linearModOp <- lm(insatisfacao ~ sentimentos_op, data=reclamacoes)
#linearModSent <- lm(insatisfacao ~ sentimentos_sent, data=reclamacoes)
#cor_op <- cor(reclamacoes$sentimentos_op, reclamacoes$insatisfacao)
#cor_sent <- cor(reclamacoes$sentimentos_sent, reclamacoes$insatisfacao)
# Scatter plot of the op30-based score against the `insatisfacao` column.
# `mode` is set explicitly so plotly does not have to guess it and emit the
# "No scatter mode specifed" message.
p <- plot_ly(
  reclamacoes,
  x = ~insatisfacao,
  y = ~sentimentos_op,
  type = "scatter",
  mode = "markers"
)
p
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# Scatter plot of the sent-based score against the `insatisfacao` column.
# `mode` is set explicitly so plotly does not have to guess it and emit the
# "No scatter mode specifed" message.
q <- plot_ly(
  reclamacoes,
  x = ~insatisfacao,
  y = ~sentimentos_sent,
  type = "scatter",
  mode = "markers"
)
q
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode